Creation - jeremy.andreoletti@ens.fr - 13/04/2020
# setwd("~/Documents/Scolaire/ENS/Stage_M1/Cetaceans/")
cetacea_pbdb <- read.csv("data-cetaceans/cetacea_pbdb_05_Apr_2018.csv", skip = 16)
summary(cetacea_pbdb)
## occurrence_no record_type reid_no flags collection_no
## Min. : 68135 occ:4476 Min. :11942 Mode:logical Min. : 4868
## 1st Qu.: 540497 1st Qu.:18135 NA's:4476 1st Qu.: 48884
## Median : 725033 Median :21552 Median : 67030
## Mean : 804995 Mean :22595 Mean : 82570
## 3rd Qu.:1021108 3rd Qu.:27547 3rd Qu.: 99584
## Max. :1396622 Max. :34473 Max. :192401
## NA's :4191
## identified_name identified_rank identified_no
## Cetacea indet. : 271 species :2438 Min. : 36652
## Mysticeti indet. : 140 genus : 675 1st Qu.: 42951
## Odontoceti indet. : 124 family : 648 Median : 63219
## Delphinidae indet. : 96 suborder : 297 Mean : 67516
## Balaenopteridae indet.: 92 unranked clade: 287 3rd Qu.: 65078
## Balaena mysticetus : 79 superfamily : 71 Max. :367667
## (Other) :3674 (Other) : 60
## difference accepted_name accepted_rank
## :3557 Cetacea : 294 species :2057
## recombined as : 302 Odontoceti : 240 family : 761
## nomen dubium : 301 Mysticeti : 208 genus : 741
## subjective synonym of: 118 Balaenopteridae : 156 suborder : 471
## invalid subgroup of : 56 Delphinidae : 107 unranked clade: 310
## corrected to : 30 Balaena mysticetus: 84 superfamily : 74
## (Other) : 112 (Other) :3387 (Other) : 62
## accepted_no early_interval late_interval
## Min. : 36652 Holocene : 626 :3967
## 1st Qu.: 42937 Langhian : 355 Late Pliocene : 80
## Median : 62924 Burdigalian : 310 Langhian : 60
## Mean : 64432 Zanclean : 309 Messinian : 59
## 3rd Qu.: 64626 Tortonian : 267 Serravallian : 58
## Max. :367667 Serravallian: 239 Early Pleistocene: 34
## (Other) :2370 (Other) : 218
## max_ma min_ma reference_no
## Min. : 0.0117 Min. : 0.000 Min. : 289
## 1st Qu.: 3.6000 1st Qu.: 2.588 1st Qu.:12813
## Median :11.6200 Median : 5.333 Median :23666
## Mean :13.9433 Mean :10.117 Mean :27322
## 3rd Qu.:20.4400 3rd Qu.:13.820 3rd Qu.:38452
## Max. :66.0000 Max. :47.800 Max. :65107
##
head(cetacea_pbdb, 20)
## occurrence_no record_type reid_no flags collection_no
## 1 68135 occ NA NA 4868
## 2 137494 occ NA NA 11601
## 3 141404 occ NA NA 12121
## 4 147937 occ NA NA 13063
## 5 147938 occ NA NA 13064
## 6 148079 occ NA NA 13078
## 7 148335 occ NA NA 13090
## 8 148353 occ NA NA 13092
## 9 148356 occ NA NA 13096
## 10 148358 occ NA NA 13098
## 11 148360 occ NA NA 13100
## 12 148363 occ NA NA 13102
## 13 148364 occ NA NA 13102
## 14 148365 occ 19615 NA 13103
## 15 150826 occ NA NA 11596
## 16 150827 occ NA NA 13103
## 17 150828 occ NA NA 13402
## 18 150829 occ NA NA 13402
## 19 150830 occ NA NA 13403
## 20 150831 occ NA NA 13403
## identified_name identified_rank identified_no
## 1 n. gen. Georgiacetus n. sp. vogtlensis species 63123
## 2 Argyrocetus joaquinensis species 69897
## 3 n. gen. Kharthlidelphis n. sp. diceros species 53161
## 4 n. gen. Pinocetus n. sp. polonicus species 53140
## 5 n. gen. Basiloterus n. sp. hussaini species 53165
## 6 n. gen. Sachalinocetus n. sp. cholmicus species 63225
## 7 n. gen. Praekogia n. sp. cedrosensis species 53139
## 8 Aulophyseter n. sp. rionegrensis species 53106
## 9 Microcetus n. sp. sharkovi species 53137
## 10 n. gen. Mixocetus n. sp. elysius species 64432
## 11 n. gen. Austrosqualodon n. sp. trirhizodonta species 63212
## 12 Basilosaurus isis species 53287
## 13 Dorudon atrox species 53288
## 14 Dorudon atrox species 53288
## 15 Basilosaurus n. sp. drazindai species 53163
## 16 Basilosaurus isis species 53287
## 17 Basilosaurus isis species 53287
## 18 Dorudon atrox species 53288
## 19 Basilosaurus isis species 53287
## 20 Dorudon atrox species 53288
## difference accepted_name accepted_rank accepted_no
## 1 Georgiacetus vogtlensis species 63123
## 2 Argyrocetus joaquinensis species 69897
## 3 nomen dubium Kharthlidelphis genus 53160
## 4 Pinocetus polonicus species 53140
## 5 Basiloterus hussaini species 53165
## 6 Sachalinocetus cholmicus species 63225
## 7 Praekogia cedrosensis species 53139
## 8 invalid subgroup of Physeteroidea superfamily 53105
## 9 Microcetus sharkovi species 53137
## 10 Mixocetus elysius species 64432
## 11 Austrosqualodon trirhizodonta species 63212
## 12 Basilosaurus isis species 62984
## 13 Dorudon atrox species 62994
## 14 Dorudon atrox species 62994
## 15 Basilosaurus drazindai species 53163
## 16 Basilosaurus isis species 62984
## 17 Basilosaurus isis species 62984
## 18 Dorudon atrox species 62994
## 19 Basilosaurus isis species 62984
## 20 Dorudon atrox species 62994
## early_interval late_interval max_ma min_ma reference_no
## 1 Lutetian 47.800 41.300 289
## 2 Aquitanian 23.030 20.440 4175
## 3 Chattian 28.100 23.030 6018
## 4 Langhian 15.970 13.820 4344
## 5 Bartonian 41.300 38.000 6010
## 6 Early Miocene Middle Miocene 23.030 11.608 4357
## 7 Messinian 7.246 5.333 4361
## 8 Messinian 7.246 5.333 4362
## 9 Chattian 28.100 23.030 4365
## 10 Tortonian 11.620 7.246 10152
## 11 Duntroonian 27.300 25.200 4368
## 12 Priabonian 38.000 33.900 6026
## 13 Priabonian 38.000 33.900 6026
## 14 Priabonian 38.000 33.900 10457
## 15 Bartonian 41.300 38.000 6010
## 16 Priabonian 38.000 33.900 6026
## 17 Priabonian 38.000 33.900 6026
## 18 Priabonian 38.000 33.900 6026
## 19 Priabonian 38.000 33.900 6026
## 20 Priabonian 38.000 33.900 6026
Reorder accepted ranks according to classification standard.
levels(cetacea_pbdb$accepted_rank)
## [1] "family" "genus" "infraorder" "species"
## [5] "subfamily" "suborder" "subspecies" "superfamily"
## [9] "unranked clade"
cetacea_pbdb$accepted_rank <- factor(cetacea_pbdb$accepted_rank, levels=c("subspecies", "species", "genus", "subfamily", "family", "superfamily", "infraorder", "suborder", "unranked clade"))
# Compute the mean age in the time interval
cetacea_pbdb$age_mean <- (cetacea_pbdb$min_ma+cetacea_pbdb$max_ma)/2
ggplot(cetacea_pbdb, aes(x=-age_mean, y=rnorm(length(age_mean), 1, 0.05))) +
geom_point(cex=0.1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Occurrences", limits = c(0.5, 1.5)) +
scale_fill_discrete(name="Accepted rank") +
theme(panel.background = element_blank(),
axis.line.x = element_line(size = 0.5, linetype = "solid", colour = "black"),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
ggtitle(paste("Repartition of", dim(cetacea_pbdb)[1], "recorded occurrences through time"))
ggplot(cetacea_pbdb, aes(x=-age_mean)) +
geom_histogram(binwidth = 1) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Number of occurrences") +
scale_fill_discrete(name="Accepted rank") +
theme(panel.grid.major.y = element_blank(),
legend.position="bottom") +
ggtitle("Evolution of the number occurrences through time")
\(\to\) Numerous occurrences seem to have the same age interval so in order to avoid clusters let’s draw them uniformly in their stratigraphic range rather than taking the mean.
# Draw occurrence age uniformally in the time interval
cetacea_pbdb$age_runif <- mapply(function(m, M){runif(n=1, min=m, max=M)}, cetacea_pbdb$min_ma, cetacea_pbdb$max_ma)
ggplot(cetacea_pbdb, aes(x=-age_runif, y=rnorm(length(age_runif), 1, 0.05))) +
geom_point(cex=0.1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Occurrences", limits = c(0.5, 1.5)) +
scale_fill_discrete(name="Accepted rank") +
theme(panel.background = element_blank(),
axis.line.x = element_line(size = 0.5, linetype = "solid", colour = "black"),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
ggtitle(paste("Repartition of", dim(cetacea_pbdb)[1], "recorded occurrences through time"))
ggplot(cetacea_pbdb, aes(x=-age_runif)) +
geom_histogram(binwidth = 1) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Number of occurrences") +
scale_fill_discrete(name="Accepted rank") +
theme(panel.grid.major.y = element_blank(),
legend.position="bottom") +
ggtitle("Evolution of the number occurrences through time")
\(\to\) The repartition seems much smoother now.
These occurrences are too numerous for our current implementation, let’s subsample a fraction of them for now.
cetacea_pbdb_subsampled <- sample_n(cetacea_pbdb, size=round(length(age_runif)*0.10))
ggplot(cetacea_pbdb_subsampled, aes(x=-age_runif, y=rnorm(length(age_runif), 1, 0.05))) +
geom_point(cex=0.5, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Occurrences", limits = c(0.5, 1.5)) +
scale_fill_discrete(name="Accepted rank") +
theme(panel.background = element_blank(),
axis.line.x = element_line(size = 0.5, linetype = "solid", colour = "black"),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
ggtitle(paste("Repartition of", dim(cetacea_pbdb_subsampled)[1], "recorded occurrences through time"))
ggplot(cetacea_pbdb_subsampled, aes(x=-age_runif)) +
geom_histogram(binwidth = 1) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Number of occurrences") +
scale_fill_discrete(name="Accepted rank") +
theme(panel.grid.major.y = element_blank(),
legend.position="bottom") +
ggtitle("Evolution of the number occurrences through time")
\(\to\) The distribution looks similar, with some noise due to higher variance with smaller sample.
write(cetacea_pbdb$age_mean, file = "data-cetaceans/Cetacea_occurrences_mean_age.csv", sep="; ", ncolumns = dim(cetacea_pbdb)[1])
write(cetacea_pbdb$age_runif, file = "data-cetaceans/Cetacea_age_runif_age.csv", sep="; ", ncolumns = dim(cetacea_pbdb)[1])
write(cetacea_pbdb_subsampled$age_runif, file = "data-cetaceans/Cetacea_age_runif_age_subsampled.csv", sep="; ", ncolumns = dim(cetacea_pbdb_subsampled)[1])
ggplot(cetacea_pbdb, aes(x=factor(record_type), fill=accepted_rank)) +
geom_bar(width = 1, stat = "count") +
coord_polar("y") +
scale_fill_discrete(name="Accepted rank") +
labs(x = NULL, y = NULL, fill = NULL, title = "Repartition of occurrences among accepted ranks")
\(\to\) Half of the occurrences are identified at the level of the secies and 1/3 at the genus or family. Very few occurences for subspecies/infraorder, maybe fuse with species and suborder or remove ?
ggplot(cetacea_pbdb, aes(x=-age_runif, fill=accepted_rank)) +
geom_histogram(binwidth = 1) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Number of occurrences") +
scale_fill_discrete(name="Accepted rank") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Evolution of the number occurrences through time, by accepted rank") +
facet_grid(accepted_rank ~ ., scales="free")
\(\to\) Similar trends with peaks at ~15My and ~5My and a lot of them around 0 (artefact ?).
ggplot(cetacea_pbdb, aes(x=reorder(accepted_name, table(accepted_name)[accepted_name]), fill=accepted_rank)) +
geom_bar(stat="count") +
scale_x_discrete(name = "Accepted names") +
scale_y_continuous(trans = "log10", name = "Number of corresponding occurrences (log-scale)") +
scale_fill_discrete(name="Accepted rank") +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
panel.grid.major.x = element_blank(),
legend.position="bottom") +
ggtitle("Distributions of the number occurrences with the same accepted name, by accepted rank") +
facet_grid(. ~ accepted_rank, scales="free_x")
\(\to\) ~Half of species/genera/subfamilies have only one specimen by acepted name, but it could go up to ~50 within the same species and ~200 occurrences within the same suborder. Those differences will have to be corrected because in our model all species are upposed to have the same abundance (identical sampling rates among branches).
\(\implies\) Our goal now will be to correct this abundance bias.
ggplot(cetacea_pbdb, aes(x=reorder(late_interval, -max_ma, median), group = -max_ma, fill=-max_ma)) +
geom_bar(stat="count") +
scale_x_discrete(name = "Late Stratigraphic Limit") +
scale_y_continuous(name = "Number of corresponding occurrences") +
scale_fill_continuous(name = "Maximum Age (My)") +
ggtitle("Distribution of maximum stratigraphic ages") +
theme(panel.grid.major.x = element_blank(),
legend.position="bottom",
axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(cetacea_pbdb, aes(x=reorder(early_interval, -min_ma, median), group = -min_ma, fill=-min_ma)) +
geom_bar(stat="count") +
scale_x_discrete(name = "Early Stratigraphic Limit") +
scale_y_continuous(name = "Number of corresponding occurrences") +
scale_fill_continuous(name = "Minimum Age (My)") +
ggtitle("Distribution of minimum stratigraphic ages") +
theme(panel.grid.major.x = element_blank(),
legend.position="bottom",
axis.text.x = element_text(angle = 90, hjust = 1))
\(\to\) Most species have a early but not a late stratigraphic limit.
# Get the stratigraphic interval of time uncertainty
cetacea_pbdb$age_interval <- mapply(function(min_age, max_age){paste("[",-max_age,", ",-min_age,"]", sep = "")}, cetacea_pbdb$min_ma, cetacea_pbdb$max_ma)
ggplot(cetacea_pbdb, aes(x=-max_ma, y=age_interval)) +
geom_segment(aes(x = -max_ma, y = reorder(age_interval, -age_mean), xend = -min_ma, yend=reorder(age_interval, -age_mean), color=-age_mean)) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Age intervals") +
labs(colour="Mean age") +
ggtitle("Distribution of age intervals") +
theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.grid.major.y = element_blank())
# Count the number of occurrences with the same age interval
cetacea_pbdb$age_interval_count <- sapply(cetacea_pbdb$age_interval, function(x){table(cetacea_pbdb$age_interval)[x]})
my_breaks <- c(1,3,10,40,150,max(cetacea_pbdb$age_interval_count))
ggplot(cetacea_pbdb, aes(x=-max_ma, y=age_interval)) +
geom_segment(aes(x = -max_ma, y = reorder(age_interval, -age_mean), xend = -min_ma, yend=reorder(age_interval, -age_mean), color=age_interval_count)) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Age intervals") +
labs(colour="Age interval count") +
scale_color_gradientn(colours = c("skyblue", "coral", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.grid.major.y = element_blank(),
legend.position = "bottom") +
ggtitle("Distributions of fossil range ages, by accepted rank") +
facet_grid(. ~ accepted_rank)
# Count the number of occurrences with the same accepted name
cetacea_pbdb$accepted_name_count <- sapply(cetacea_pbdb$accepted_name, function(x){table(cetacea_pbdb$accepted_name)[x]})
my_breaks <- c(1,3,10,35,100,max(cetacea_pbdb$accepted_name_count))
ggplot(cetacea_pbdb[!(cetacea_pbdb$accepted_rank %in% c("species", "genus", "family")),], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_ma, y = reorder(accepted_name, -age_mean), xend = -min_ma, yend=reorder(accepted_name, -age_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.3) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Accepted ranks") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank()) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil range ages, by accepted rank") +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb[cetacea_pbdb$accepted_rank =="species",], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_ma, y = reorder(accepted_name, -age_mean), xend = -min_ma, yend=reorder(accepted_name, -age_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.3) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Age ranges") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank(),
axis.text.y=element_text(size=2)) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil range ages (species)") +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb[cetacea_pbdb$accepted_rank =="genus",], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_ma, y = reorder(accepted_name, -age_mean), xend = -min_ma, yend=reorder(accepted_name, -age_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.3) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Genus") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank(),
axis.text.y=element_text(size=6)) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil range ages (genera)") +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb[cetacea_pbdb$accepted_rank =="family",], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_ma, y = reorder(accepted_name, -age_mean), xend = -min_ma, yend=reorder(accepted_name, -age_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.3) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Genus") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank()) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil range ages (families)") +
facet_grid(accepted_rank ~ ., scales = "free")
\(\to\) Some occurrences have too much age uncertainty, they could be removed because they are not very informative.
# Get the duration of the time intervals
cetacea_pbdb$age_range <- cetacea_pbdb$max_ma-cetacea_pbdb$min_ma
# Occurrences whose time range is lower than 10 million years
dim(cetacea_pbdb[cetacea_pbdb$age_range<10,])
## [1] 4157 23
ggplot(cetacea_pbdb, aes(x=age_range, fill=accepted_rank)) +
geom_histogram(binwidth = 1) +
scale_x_continuous(name = "Age uncertainty range (My)") +
scale_y_continuous(name = "Number of corresponding occurrences") +
geom_segment(aes(x = 10, y = 0, xend = 10, yend=1050), color="black", linetype="dashed") +
ggtitle("Distribution of occurrences' age uncertainty range")
Most of occurrences (4157) show less than 10 My age uncertainty, let’s keep only these ones.
my_breaks <- c(1,3,10,35,100,max(cetacea_pbdb$accepted_name_count))
ggplot(cetacea_pbdb[!(cetacea_pbdb$accepted_rank %in% c("species", "genus", "family")) & cetacea_pbdb$age_range<10,], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_ma, y = reorder(accepted_name, -age_mean), xend = -min_ma, yend=reorder(accepted_name, -age_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Accepted ranks") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank()) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil range ages, by accepted rank") +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb[cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10,], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_ma, y = reorder(accepted_name, -age_mean), xend = -min_ma, yend=reorder(accepted_name, -age_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Age ranges") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank(),
axis.text.y=element_text(size=2)) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil range ages (species)") +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb[cetacea_pbdb$accepted_rank =="genus" & cetacea_pbdb$age_range<10,], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_ma, y = reorder(accepted_name, -age_mean), xend = -min_ma, yend=reorder(accepted_name, -age_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Genus") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank(),
axis.text.y=element_text(size=6)) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil range ages (genera)") +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb[cetacea_pbdb$accepted_rank =="family" & cetacea_pbdb$age_range<10,], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_ma, y = reorder(accepted_name, -age_mean), xend = -min_ma, yend=reorder(accepted_name, -age_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Genus") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank()) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil range ages (families)") +
facet_grid(accepted_rank ~ ., scales = "free")
\(\to\) Some species (or other ranks) have several occurrences with several time ranges, let’s combine them into a unique range covering all the other.
# Get the minimun of the minimal ages for each accepted name
cetacea_pbdb$min_min_ma <- apply(cetacea_pbdb, 1, function(X){min(cetacea_pbdb[cetacea_pbdb$accepted_name==X["accepted_name"] & cetacea_pbdb$age_range<10,]$min_ma)})
# Get the maximum of the maximal ages for each accepted name
cetacea_pbdb$max_max_ma <- apply(cetacea_pbdb, 1, function(X){max(cetacea_pbdb[cetacea_pbdb$accepted_name==X["accepted_name"] & cetacea_pbdb$age_range<10,]$max_ma)})
# Get the combined time interval
cetacea_pbdb$age_combined_interval <- mapply(function(min_min_age, max_max_age){paste("[",-max_max_age,", ",-min_min_age,"]", sep = "")}, cetacea_pbdb$min_min_ma, cetacea_pbdb$max_max_ma)
# Get the combined time interval mean
cetacea_pbdb$age_combined_mean <- (cetacea_pbdb$min_min_ma + cetacea_pbdb$max_max_ma)/2
# Get the duration of the combined time intervals
cetacea_pbdb$age_combined_range <- cetacea_pbdb$max_max_ma-cetacea_pbdb$min_min_ma
ggplot(cetacea_pbdb[!(cetacea_pbdb$accepted_rank %in% c("species", "genus", "family")) & cetacea_pbdb$age_range<10,], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_max_ma, y = reorder(accepted_name, -age_combined_mean), xend = -min_min_ma, yend=reorder(accepted_name, -age_combined_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Accepted ranks") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank()) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil combined range ages, by accepted rank") +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb[cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10,], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_max_ma, y = reorder(accepted_name, -age_combined_mean), xend = -min_min_ma, yend=reorder(accepted_name, -age_combined_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Age ranges") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank(),
axis.text.y=element_text(size=2)) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil combined range ages (species)") +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb[cetacea_pbdb$accepted_rank =="genus" & cetacea_pbdb$age_range<10,], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_max_ma, y = reorder(accepted_name, -age_combined_mean), xend = -min_min_ma, yend=reorder(accepted_name, -age_combined_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Genera") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank(),
axis.text.y=element_text(size=6)) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil combined range ages (genera)") +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb[cetacea_pbdb$accepted_rank =="family" & cetacea_pbdb$age_range<10,], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_max_ma, y = reorder(accepted_name, -age_combined_mean), xend = -min_min_ma, yend=reorder(accepted_name, -age_combined_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Families") +
labs(colour="Accepted names count") +
theme(panel.grid.major.y = element_blank()) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle("Distributions of fossil combined range ages (families)") +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb[cetacea_pbdb$age_range<10,], aes(x=accepted_name_count/age_range, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.1) +
scale_x_continuous(trans="log10", name="Density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Distributions occurrence density, by accepted rank") +
facet_grid(accepted_rank ~ .)
\(\to\) Density logically increases as taxa ranks increase, but there is a cluster of high density within the species. Let’s identify its origin :
# Get interval density of occurrences
cetacea_pbdb$age_interval_density <- cetacea_pbdb$accepted_name_count/cetacea_pbdb$age_range
ggplot(cetacea_pbdb[cetacea_pbdb$age_range<10,], aes(x=age_interval_density, fill=factor(max_ma))) +
geom_histogram(binwidth=0.1) +
scale_x_continuous(trans="log10", name="Density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Distributions occurrence density, by accepted rank") +
facet_grid(accepted_rank ~ .)
ggplot(cetacea_pbdb[cetacea_pbdb$age_range<10,], aes(x=age_interval_density, fill=factor(age_range))) +
geom_histogram(binwidth=0.1) +
scale_x_continuous(trans="log10", name="Density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Distributions occurrence density, by accepted rank") +
facet_grid(accepted_rank ~ .)
\(\to\) This cluster corresponds to recent samples (< 150.000 years) with very precise datation (different technique ?). Let’s try to remove it but later it could be interesting to subsample instead.
ggplot(cetacea_pbdb[!(cetacea_pbdb$max_ma %in% c(0.0117, 0.126)) & cetacea_pbdb$age_range<10,], aes(x=accepted_name_count/age_range, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.1) +
scale_x_continuous(trans="log10", name="Density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Distributions occurrence density, by accepted rank") +
facet_grid(accepted_rank ~ .)
\(\to\) The distributions look normal again.
Compare the number of occurrences through time with or without the concentrated ones at present
ggplot(cetacea_pbdb, aes(x=-age_runif, fill=accepted_rank)) +
geom_histogram(binwidth = 0.2) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Number of occurrences") +
scale_fill_discrete(name="Accepted rank") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Evolution of the number occurrences through time, by accepted rank") +
facet_grid(accepted_rank ~ ., scales="free")
ggplot(cetacea_pbdb[!(cetacea_pbdb$max_ma %in% c(0.0117, 0.126)),], aes(x=-age_runif, fill=accepted_rank)) +
geom_histogram(binwidth = 0.2) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Number of occurrences") +
scale_fill_discrete(name="Accepted rank") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Evolution of the number occurrences through time, by accepted rank") +
facet_grid(accepted_rank ~ ., scales="free")
\(\to\) The removal gives much clearer occurrence distributions at the species and genera levels.
If we want to correct species abundance differences based on the number of occurrences in the time range (“density”), those factors should not depend on time in order to avoid penalizing periods with bigger ranges.
# Function to plot linear regression coefficients
lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
conditions <- cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10
# Keep only one point by identical time interval
age_interval_table <- table(cetacea_pbdb[conditions,]$age_interval)
interval_mean_age <- sapply(names(age_interval_table), function(x){cetacea_pbdb[conditions & cetacea_pbdb$age_interval==x,]$age_mean[1]})
interval_range <- sapply(names(age_interval_table), function(x){cetacea_pbdb[conditions & cetacea_pbdb$age_interval==x,]$age_range[1]})
df <- data.frame(x=interval_mean_age,
y=interval_range)
ggplot(df, aes(x=x, y=y)) +
geom_point(alpha=0.5) +
geom_smooth(formula = y~x, method=lm) +
geom_smooth(formula = y~x, method=lm) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Uncertainty range") +
ggtitle("Correlation between time range and age") +
geom_text(x = 30, y = 9, label = lm_eqn(df), parse = TRUE)
# Keep only one point by identical combined time interval
age_combined_interval_table <- table(cetacea_pbdb[conditions,]$age_combined_interval)
combined_interval_mean_age <- sapply(names(age_combined_interval_table), function(x){cetacea_pbdb[conditions & cetacea_pbdb$age_combined_interval==x,]$age_combined_mean[1]})
combined_interval_range <- sapply(names(age_combined_interval_table), function(x){cetacea_pbdb[conditions & cetacea_pbdb$age_combined_interval==x,]$age_combined_range[1]})
combined_df <- data.frame(x=combined_interval_mean_age,
y=combined_interval_range)
ggplot(combined_df, aes(x=x, y=y)) +
geom_point(alpha=0.5) +
geom_smooth(formula = y~x, method=lm) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Combined uncertainty range") +
ggtitle("Correlation between combined time range and age") +
geom_text(x = 30, y = 15, label = lm_eqn(combined_df), parse = TRUE)
\(\to\) It seems that age range in much less correlated with time when we take the full combined range into account. However this plot is quite ugly (“triangle” instead of a nice point cloud) so this correlation may not be very meaningful. Nevertheless, we will use those ranges for normalising the occurrence density because we don’t want to penalize older specimens.
Let’s look at the density directly, because this is what is interesting us directly.
conditions <- cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10 & !(cetacea_pbdb$max_ma %in% c(0.0117, 0.126))
# Keep only one point by identical time interval
age_interval_table <- table(cetacea_pbdb[conditions,]$age_interval)
interval_mean_age <- sapply(names(age_interval_table), function(x){cetacea_pbdb[conditions & cetacea_pbdb$age_interval==x,]$age_mean[1]})
interval_density <- sapply(names(age_interval_table), function(x){cetacea_pbdb[conditions & cetacea_pbdb$age_interval==x,]$age_interval_density[1]})
df_density <- data.frame(x=interval_mean_age,
y=interval_density)
ggplot(df_density, aes(x=x, y=y)) +
geom_point(alpha=0.5) +
geom_smooth(formula = y~x, method=lm) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Combined density") +
ggtitle("Correlation between density and age") +
geom_text(x = 30, y = 15, label = lm_eqn(df_density), parse = TRUE)
# Keep only one point by identical combined time interval
age_combined_interval_table <- table(cetacea_pbdb[conditions,]$age_combined_interval)
combined_interval_mean_age <- sapply(names(age_combined_interval_table), function(x){cetacea_pbdb[conditions & cetacea_pbdb$age_combined_interval==x,]$age_combined_mean[1]})
cetacea_pbdb$age_combined_interval_density <- cetacea_pbdb$accepted_name_count/cetacea_pbdb$age_combined_range
combined_interval_density <- sapply(names(age_combined_interval_table), function(x){cetacea_pbdb[conditions & cetacea_pbdb$age_combined_interval==x,]$age_combined_interval_density[1]})
combined_df_density <- data.frame(x=combined_interval_mean_age,
y=combined_interval_density)
ggplot(combined_df_density, aes(x=x, y=y)) +
geom_point(alpha=0.5) +
geom_smooth(formula = y~x, method=lm) +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Combined density") +
ggtitle("Correlation between combined density and age") +
geom_text(x = 30, y = 15, label = lm_eqn(combined_df_density), parse = TRUE)
\(\to\) Again the combined version is less correlated with time.
ggplot(cetacea_pbdb[cetacea_pbdb$age_range<10,], aes(x=accepted_name_count/age_range, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.1) +
scale_x_continuous(trans="log10", name="Density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Distributions of occurrence density, by accepted rank") +
facet_grid(accepted_rank ~ .)
ggplot(cetacea_pbdb[cetacea_pbdb$age_range<10,], aes(x=age_combined_interval_density, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.05) +
scale_x_continuous(trans="log10", name="Combined density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Distributions of combined occurrence density, by accepted rank") +
facet_grid(accepted_rank ~ .)
ggplot(cetacea_pbdb[cetacea_pbdb$age_range<10 & !(cetacea_pbdb$max_ma %in% c(0.0117, 0.126)),], aes(x=age_combined_interval_density, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.05) +
scale_x_continuous(trans="log10", name="Combined density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Distributions of combined occurrence density (without recent samples), by accepted rank") +
facet_grid(accepted_rank ~ .)
\(\to\) Densities are smaller and more concentrated with the combined ranges (larger time span + less ranges in total because of combination).
Let’s focus now on the occurrences accepted at the species level because they are the one for which we can correct the abundance bias by subsampling the most concentrated combined intervals.
conditions <- cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10
ggplot(cetacea_pbdb[conditions,], aes(x=accepted_name_count/age_range, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.1) +
scale_x_continuous(trans="log10", name="Density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Distributions of occurrence density among species") +
facet_grid(accepted_name_count ~ .)
ggplot(cetacea_pbdb[conditions,], aes(x=age_combined_interval_density, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.05) +
scale_x_continuous(trans="log10", name="Combined density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Distributions of combined occurrence density among species") +
facet_grid(accepted_name_count ~ .)
ggplot(cetacea_pbdb[conditions & !(cetacea_pbdb$max_ma %in% c(0.0117, 0.126)),], aes(x=age_combined_interval_density, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.05) +
scale_x_continuous(trans="log10", name="Combined density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle("Distributions of combined occurrence density among species (without recent samples)") +
facet_grid(accepted_name_count ~ ., scales = "free")
\(\to\) Their is a huge span of densities driven by the number of occurrences for the same species that we can reduce by subsampling the most concentrated intervals.
conditions <- cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10
ggplot(cetacea_pbdb[conditions,], aes(x=age_combined_interval_density, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.05) +
scale_x_continuous(trans="log10", name="Combined density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle(paste("Distributions of combined occurrence density among species (n = ", dim(cetacea_pbdb[conditions,])[1], ")", sep="")) +
facet_grid(accepted_name_count ~ ., scales = "free")
# Add at least one occurrence of each species
cetacea_pbdb_sampled <- c()
cetacea_pbdb_notsampled <- c()
for (i in as.numeric(row.names(cetacea_pbdb[conditions,]))){
row <- cetacea_pbdb[as.character(i),]
if (!(row$accepted_name %in% cetacea_pbdb_sampled$accepted_name)){
cetacea_pbdb_sampled <- rbind(cetacea_pbdb_sampled, row)
}else{
cetacea_pbdb_notsampled <- rbind(cetacea_pbdb_notsampled, row)
}
}
# Add occurrences in the remaning ones, by underweighting dense species
cetacea_pbdb_sampled <- rbind(cetacea_pbdb_sampled,
sample_n(cetacea_pbdb_notsampled,
size=1000-dim(cetacea_pbdb_sampled)[1],
weight=1/cetacea_pbdb_notsampled$age_combined_interval_density**2))
cetacea_pbdb_sampled$min_min_ma <- apply(cetacea_pbdb_sampled, 1, function(X){min(cetacea_pbdb_sampled[cetacea_pbdb_sampled$accepted_name==X["accepted_name"],]$min_ma)})
cetacea_pbdb_sampled$max_max_ma <- apply(cetacea_pbdb_sampled, 1, function(X){max(cetacea_pbdb_sampled[cetacea_pbdb_sampled$accepted_name==X["accepted_name"],]$max_ma)})
cetacea_pbdb_sampled$age_combined_range <- cetacea_pbdb_sampled$max_max_ma-cetacea_pbdb_sampled$min_min_ma
cetacea_pbdb_sampled$accepted_name_count <- sapply(cetacea_pbdb_sampled$accepted_name, function(x){table(cetacea_pbdb_sampled$accepted_name)[x]})
cetacea_pbdb_sampled$age_combined_interval_density <- cetacea_pbdb_sampled$accepted_name_count/cetacea_pbdb_sampled$age_combined_range
ggplot(cetacea_pbdb_sampled, aes(x=age_combined_interval_density, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.05) +
scale_x_continuous(trans="log10", name="Combined density of occurrences (number/My)", limits=c(0.1, 1000)) +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle(paste("Distributions of combined occurrence density among species, after sampling (n = ", dim(cetacea_pbdb_sampled)[1], ")", sep="")) +
facet_grid(accepted_name_count ~ ., scales = "free")
## Warning: Removed 18 rows containing missing values (geom_bar).
\(\to\) Subsampling successfully reduces the density span from 2 to 1 order of magnitude, apart from the artefactual recent samples that we can hide :
conditions <- cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10
ggplot(cetacea_pbdb[conditions & !(cetacea_pbdb$max_ma %in% c(0.0117, 0.126)),], aes(x=age_combined_interval_density, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.05) +
scale_x_continuous(trans="log10", name="Combined density of occurrences (number/My)") +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle(paste("Distributions of combined occurrence density among species, without recent samples (n = ", dim(cetacea_pbdb[conditions & !(cetacea_pbdb$max_ma %in% c(0.0117, 0.126)),])[1], ")", sep="")) +
facet_grid(accepted_name_count ~ ., scales = "free")
ggplot(cetacea_pbdb_sampled[!(cetacea_pbdb_sampled$max_ma %in% c(0.0117, 0.126)),], aes(x=age_combined_interval_density, fill=factor(accepted_name_count))) +
geom_histogram(binwidth=0.05) +
scale_x_continuous(trans="log10", name="Combined density of occurrences (number/My)", limits=range(cetacea_pbdb[conditions & !(cetacea_pbdb$max_ma %in% c(0.0117, 0.126)),]$age_combined_interval_density)) +
scale_fill_discrete(name="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
ggtitle(paste("Distributions of combined occurrence density among species, without recent samples (n = ", dim(cetacea_pbdb_sampled[!(cetacea_pbdb_sampled$max_ma %in% c(0.0117, 0.126)),])[1], ")", sep="")) +
facet_grid(accepted_name_count ~ ., scales = "free")
## Warning: Removed 18 rows containing missing values (geom_bar).
See what our distributions
ggplot(cetacea_pbdb[cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10,], aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_max_ma, y = reorder(accepted_name, -age_combined_mean), xend = -min_min_ma, yend=reorder(accepted_name, -age_combined_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Species") +
labs(colour="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle(paste("Distributions of species fossil range ages, before correcting sampling (n = ", dim(cetacea_pbdb[cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10,])[1], ")", sep="")) +
facet_grid(accepted_rank ~ ., scales = "free")
ggplot(cetacea_pbdb_sampled, aes(x=-max_ma, y=accepted_name)) +
geom_segment(aes(x = -max_max_ma, y = reorder(accepted_name, -age_combined_mean), xend = -min_min_ma, yend=reorder(accepted_name, -age_combined_mean), color=accepted_name_count), arrow=arrow(ends = "both", angle=90, length = unit(1, "mm")), alpha=0.5) +
geom_point(aes(x=-age_runif), col="red", cex=1, alpha=0.5) +
scale_x_continuous(name = "Time (My)") +
scale_y_discrete(name = "Species") +
labs(colour="Accepted name count") +
theme(panel.grid.major.y = element_blank()) +
scale_color_gradientn(colours = c("darkblue", "forestgreen", "black"), trans="log", breaks=my_breaks, labels=my_breaks) +
ggtitle(paste("Distributions of species fossil range ages, after correcting sampling (n = ", dim(cetacea_pbdb_sampled)[1], ")", sep="")) +
facet_grid(accepted_rank ~ ., scales = "free")
\(\to\) Some highly dense cluster became much more similar to the others.
conditions <- cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10
ggplot(cetacea_pbdb[conditions,], aes(x=-age_runif)) +
geom_histogram(binwidth = 0.2, alpha=0.5, fill="red") +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Number of occurrences") +
scale_fill_discrete(name="Accepted rank") +
theme(panel.grid.major.y = element_blank()) +
ggtitle(paste("Evolution of the species occurrence number through time (n = ", dim(cetacea_pbdb[conditions,])[1], ")", sep=""))
ggplot(cetacea_pbdb_sampled, aes(x=-age_runif)) +
geom_histogram(binwidth = 0.2, alpha=0.5, fill="blue") +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Number of occurrences") +
scale_fill_discrete(name="Accepted rank") +
theme(panel.grid.major.y = element_blank()) +
ggtitle(paste("Evolution of the species occurrence number through time, after correction sampling (n = ", dim(cetacea_pbdb_sampled)[1], ")", sep=""))
conditions <- cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10
ggplot(cetacea_pbdb[conditions & !(cetacea_pbdb$max_ma %in% c(0.0117, 0.126)),], aes(x=-age_runif)) +
geom_histogram(binwidth = 0.2, alpha=0.5, fill="red") +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Number of occurrences") +
scale_fill_discrete(name="Accepted rank") +
theme(panel.grid.major.y = element_blank()) +
ggtitle(paste("Evolution of the species occurrence number through time, without recent samples (n = ", dim(cetacea_pbdb[conditions & !(cetacea_pbdb$max_ma %in% c(0.0117, 0.126)),])[1], ")", sep=""))
ggplot(cetacea_pbdb_sampled[!(cetacea_pbdb_sampled$max_ma %in% c(0.0117, 0.126)),], aes(x=-age_runif)) +
geom_histogram(binwidth = 0.2, alpha=0.5, fill="blue") +
scale_x_continuous(name = "Time (My)") +
scale_y_continuous(name = "Number of occurrences") +
scale_fill_discrete(name="Accepted rank") +
theme(panel.grid.major.y = element_blank()) +
ggtitle(paste("Evolution of the species occurrence number through time, without recent samples and after correction sampling (n = ", dim(cetacea_pbdb_sampled[!(cetacea_pbdb_sampled$max_ma %in% c(0.0117, 0.126)),])[1], ")", sep=""))
If we superpose these 2 plots :
conditions <- cetacea_pbdb$accepted_rank =="species" & cetacea_pbdb$age_range<10
p1 <- ggplot(cetacea_pbdb[conditions & !(cetacea_pbdb$max_ma %in% c(0.0117, 0.126)),], aes(x=-age_runif)) +
geom_histogram(binwidth = 0.2, alpha=0.5, fill="red") +
theme_void()
p2 <- ggplot(cetacea_pbdb_sampled[!(cetacea_pbdb_sampled$max_ma %in% c(0.0117, 0.126)),], aes(x=-age_runif)) +
geom_histogram(binwidth = 0.2, alpha=0.5, fill="blue") +
theme_void()
p1 + annotation_custom(ggplotGrob(p2))
\(\to\) We get the new species occurrence repartition after subsampling correction, that could be used for doing inference with the occurrence birth-death model.
Achievements :
Open questions :